library(fields)
library(readr)
library(tidyverse, warn.conflicts = FALSE)
library(data.table)
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(tidyr)
library(mvtnorm)
library(ParBayesianOptimization)
library(cowplot)
library(tibble)
source("SCRIPTS/outlier_detection_helper.R")
# Preview the per-org-year winning hyperparameters from both optimizers,
# dropping likelihood / sigma^2 columns for readability.
head(best_combos |> select(-likelihood.grid, -likelihood.CNM, -sigma.squared.grid, -sigma.squared.CNM), 20)
## EIN year nu.grid nugget.grid nu.CNM nugget.CNM
## 1 EIN-04-2503758 0 0.190 0.1 0.1342 0.0000
## 2 EIN-04-2503758 1 0.190 0.2 0.1281 0.0000
## 3 EIN-04-2503758 2 0.190 0.2 0.1146 0.0000
## 4 EIN-04-2503758 3 0.190 0.2 0.1183 0.0000
## 5 EIN-04-2503758 4 0.190 0.1 0.1358 0.0000
## 6 EIN-04-2503758 6 0.190 0.0 0.1788 0.0000
## 7 EIN-04-2503758 7 0.355 0.0 0.4086 0.0000
## 8 EIN-04-2503758 8 0.190 0.0 0.1835 0.0000
## 9 EIN-04-2503758 9 0.190 0.2 0.1230 0.0000
## 10 EIN-04-2503758 10 0.190 0.2 0.1269 0.0000
## 11 EIN-04-2503758 11 0.190 0.2 0.1060 0.0000
## 12 EIN-04-2503758 12 0.190 0.3 0.0887 0.0000
## 13 EIN-04-2503758 13 0.190 0.2 0.1194 0.0000
## 14 EIN-13-2720896 0 0.520 0.0 1.2678 0.0051
## 15 EIN-13-2720896 1 0.685 0.0 0.9150 0.0045
## 16 EIN-13-2720896 2 0.685 0.0 1.0762 0.0046
## 17 EIN-13-2720896 3 0.520 0.0 1.2658 0.0040
## 18 EIN-13-2720896 4 0.520 0.0 1.2709 0.0041
## 19 EIN-13-2720896 5 0.520 0.0 1.3145 0.0039
## 20 EIN-13-2720896 6 0.520 0.0 1.2140 0.0043
# Report how often constrained Nelder-Mead matched or beat grid search.
# (paste0 is the idiomatic form of paste(..., sep = "")).
print(paste0("Out of ", nrow(best_combos), " org-year pairs, constrained Nelder-Mead got the higher likelihood (compared to grid search) for ", sum(best_combos$likelihood.grid <= best_combos$likelihood.CNM), " pairs."))
## [1] "Out of 600 org-year pairs, constrained Nelder-Mead got the higher likelihood (compared to grid search) for 596 pairs."
# Ranges (Bayesian Optimization) ----
# Ranges (Grid Search Optimization) ----
# Load the test data set as a data.table.
df <- as.data.table(read_csv("small_test.csv", show_col_types = FALSE))
# Center each organization's log revenue at zero.
df <- df |>
  group_by(EIN2) |>
  mutate(LOG_REV = LOG_REV - mean(LOG_REV)) |>
  ungroup()
# Pairwise distance matrix over the observed years.
dist_mat <- as.matrix(dist(sort(unique(df$YEAR)), diag = TRUE, upper = TRUE))
# Rescale distances to [0, 1].
rng <- range(dist_mat)
dist_mat <- (dist_mat - rng[1]) / (rng[2] - rng[1])
# Set up ranges for Matern covariance matrix hyperparameters.
rho <- 1
# Every (nu, nugget) pair to evaluate.
combos <- expand.grid(
  nu = seq(from = 0.025, to = 2.5, by = 0.165),
  nugget = seq(from = 0, to = 2.5, by = 0.1)
)
# Load constrained Nelder-Mead results and index rows by EIN
# so res[ein, ] lookups work below.
res <- readRDS("MODEL/outlier_detection/constrainNM_results.rds")
rownames(res) <- res[, "EIN"]
# Placeholder column, filled in per organization later.
res <- mutate(res, sigma.squared = 0)
all_predictions <- list()
all_candidates <- list()
all_se <- list()
mu_1 <- 0 # prior mean of the held-out year (data are centered per org)
# Looping over all organizations: leave-one-out GP prediction using the
# per-org hyperparameters stored in `res` (constrained Nelder-Mead fits).
start.time <- Sys.time()
for (ein in unique(df$EIN2)){
  df.sub <- filter(df, EIN2 == ein)
  all_years <- df.sub$YEAR + 1 # 1-based indices into dist_mat for this org's years
  mu_2 <- numeric(nrow(df.sub) - 1) # prior mean of the retained years (all zeros)
  predictions <- numeric(length(all_years)) # initialize
  standard_errors <- numeric(length(all_years)) # initialize
  # Generate Matern correlation matrix using "best" hyperparameters,
  # with the nugget added to the diagonal.
  mat_cov <- Matern(d = dist_mat,
                    smoothness = res[ein, ]$nu,
                    range = rho,
                    phi = 1) + diag(res[ein, ]$nugget, nrow(dist_mat))
  # Plug-in variance estimate given the correlation structure, then
  # scale the whole covariance by it.
  sigma.squared <- (1 / length(df.sub$LOG_REV)) * (df.sub$LOG_REV %*% solve(mat_cov[all_years, all_years]) %*% df.sub$LOG_REV)[1, 1]
  mat_cov <- sigma.squared * mat_cov
  res[ein, ]$sigma.squared <- sigma.squared
  for (i in seq_along(all_years)) {
    # Leave out i-th year; condition on the remaining observations.
    # (Filtering df.sub is equivalent to re-filtering df, but cheaper.)
    log_revenue <- (df.sub |> filter(YEAR != all_years[i] - 1))$LOG_REV
    SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
    SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
    SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]
    # Conditional Gaussian mean and standard deviation for the held-out year.
    predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
    standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
  }
  residuals <- abs((df.sub$LOG_REV - predictions) / standard_errors) # standardized residuals
  all_predictions[[ein]] <- predictions
  all_se[[ein]] <- standard_errors
  # |z| > 3 flags candidate outliers; map matrix indices back to calendar years.
  all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989
  # NOTE(review): `all_candidates[ein]` passes a one-element named list;
  # confirm plot_true_vs_pred expects that rather than `all_candidates[[ein]]`.
  p <- plot_true_vs_pred(df = df,
                         ein = ein,
                         predictions = predictions,
                         standard_errors = standard_errors,
                         all_years = all_years,
                         candidate_outliers = all_candidates[ein],
                         combo = res[ein, ], label = ein)
  print(p)
}
print(Sys.time() - start.time)
# Reload the test data set (fresh copy for the Bayesian-optimization run).
df <- as.data.table(read_csv("small_test.csv", show_col_types = FALSE))
# Center each organization's log revenue at zero.
df <- df |>
  group_by(EIN2) |>
  mutate(LOG_REV = LOG_REV - mean(LOG_REV)) |>
  ungroup()
# Pairwise distance matrix over the observed years, rescaled to [0, 1].
dist_mat <- as.matrix(dist(sort(unique(df$YEAR)), diag = TRUE, upper = TRUE))
rng <- range(dist_mat)
dist_mat <- (dist_mat - rng[1]) / (rng[2] - rng[1])
# Search space for the three Matern hyperparameters.
bounds <- list(nu = c(0.025, 2.5), nugget = c(0, 2.5), sigma.squared = c(0.025, 1.5))
opt_info <- list()
start.time <- Sys.time() # about two minutes per org / took about 50 minutes for whole dataset
for (ein in unique(df$EIN2)) {
  print(ein)
  df.subset <- df |> filter(EIN2 == ein)
  # Each org needs its own wrapper for bayesOpt, since the closure must
  # capture that org's data and years.
  get_likelihood_bayesopt_wrapper <- function(nu, nugget, sigma.squared) {
    score <- get_likelihood_bayesopt(distance.matrix = dist_mat,
                                     data = df.subset$LOG_REV,
                                     years = df.subset$YEAR + 1,
                                     nu, nugget, sigma.squared)
    list(Score = score)
  }
  opt_info[[ein]] <- bayesOpt(FUN = get_likelihood_bayesopt_wrapper,
                              bounds = bounds,
                              initPoints = 5,
                              iters.n = 50,
                              plotProgress = FALSE,
                              verbose = 0)
}
print(Sys.time() - start.time)
all_predictions <- list()
all_candidates <- list()
all_se <- list()
all_best_params <- list()
mu_1 <- 0 # prior mean of the held-out year (data are centered per org)
# Looping over all organizations: leave-one-out GP prediction using the
# Bayesian-optimization hyperparameters in `opt_info`.
start.time <- Sys.time()
for (ein in unique(df$EIN2)){
  # Hoisted: this subset was previously recomputed several times per org.
  df.sub <- df |> filter(EIN2 == ein)
  all_years <- df.sub$YEAR + 1 # 1-based indices into dist_mat for this org's years
  mu_2 <- numeric(nrow(df.sub) - 1) # prior mean of the retained years (all zeros)
  predictions <- numeric(length(all_years)) # initialize
  standard_errors <- numeric(length(all_years)) # initialize
  best.pars <- getBestPars(opt_info[[ein]])
  all_best_params[[ein]] <- best.pars
  # Generate Matern covariance matrix using "best" hyperparameters,
  # with the nugget added to the diagonal.
  mat_cov <- Matern(d = dist_mat,
                    smoothness = best.pars$nu,
                    range = 1,
                    phi = best.pars$sigma.squared) + diag(best.pars$nugget, nrow(dist_mat))
  # Train different model per org leave-one-out style.
  for (i in seq_along(all_years)) {
    # Leave out i-th year; condition on the remaining observations.
    log_revenue <- (df.sub |> filter(YEAR != all_years[i] - 1))$LOG_REV
    SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
    SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
    SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]
    # Conditional Gaussian mean and standard deviation for the held-out year.
    predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
    standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
  }
  residuals <- abs((df.sub$LOG_REV - predictions) / standard_errors) # standardized residuals
  all_predictions[[ein]] <- predictions
  all_se[[ein]] <- standard_errors
  # |z| > 3 flags candidate outliers; map matrix indices back to calendar years.
  all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989
  p <- plot_true_vs_pred(df, ein, predictions, standard_errors, all_years, all_candidates[ein], best.pars)
  print(p)
}
print(Sys.time() - start.time)
# Reload the test data set (fresh copy for the grid-search run).
df <- as.data.table(read_csv("small_test.csv", show_col_types = FALSE))
# Center each organization's log revenue at zero.
df <- df |>
  group_by(EIN2) |>
  mutate(LOG_REV = LOG_REV - mean(LOG_REV)) |>
  ungroup()
# Pairwise distance matrix over the observed years.
dist_mat <- as.matrix(dist(sort(unique(df$YEAR)), diag = TRUE, upper = TRUE))
# Rescale distances to [0, 1].
min_val <- min(dist_mat)
max_val <- max(dist_mat)
dist_mat <- (dist_mat - min_val) / (max_val - min_val)
# Set up ranges for Matern covariance matrix hyperparameters.
rho <- 1
nu_vec <- seq(from = 0.05, to = 2.5, by = 0.245)
nugget_vec <- seq(from = 0, to = 2, by = 0.2)
sigma_squared_vec <- seq(from = 0.05, to = 0.95, by = 0.1)
# All possible combinations of hyperparameters.
combos <- expand.grid(nu = nu_vec, nugget = nugget_vec, sigma.squared = sigma_squared_vec)
# Grid Search: Looping over all organizations to find best set of hyperparameters.
max_likelihoods <- list()
max_likelihoods_idx <- list()
all_likelihoods <- list()
start.time <- Sys.time()
for (ein in unique(df$EIN2)){
  # Hoisted: the org subset was previously re-filtered inside the apply
  # closure for EVERY row of `combos`.
  df.sub <- df |> filter(EIN2 == ein)
  all_years <- df.sub$YEAR + 1 # all the years we have data for that org
  log_rev <- df.sub$LOG_REV
  # Likelihood for each combination of hyperparameters.
  likelihoods <- apply(combos,
                       MARGIN = 1,
                       function(row) get_likelihood(row,
                                                    dist_mat = dist_mat,
                                                    rho = rho,
                                                    all_years = all_years,
                                                    data = log_rev))
  all_likelihoods[[ein]] <- likelihoods
  max_likelihoods[[ein]] <- max(likelihoods)
  max_likelihoods_idx[[ein]] <- which.max(likelihoods)
}
print(Sys.time() - start.time)
# One winning (nu, nugget, sigma.squared) row per org, indexed by EIN.
best_combos <- combos[as.vector(unlist(max_likelihoods_idx)), ]
row.names(best_combos) <- names(max_likelihoods_idx)
best_combos
all_predictions <- list()
all_candidates <- list()
all_se <- list()
mu_1 <- 0 # prior mean of the held-out year (data are centered per org)
# Looping over all organizations: leave-one-out GP prediction using the
# grid-search winners in `best_combos`.
start.time <- Sys.time()
for (ein in unique(df$EIN2)){
  # Hoisted: this subset was previously recomputed several times per org.
  df.sub <- df |> filter(EIN2 == ein)
  all_years <- df.sub$YEAR + 1 # 1-based indices into dist_mat for this org's years
  mu_2 <- numeric(nrow(df.sub) - 1) # prior mean of the retained years (all zeros)
  predictions <- numeric(length(all_years)) # initialize
  standard_errors <- numeric(length(all_years)) # initialize
  # Generate Matern covariance matrix using "best" hyperparameters.
  # Named columns instead of positions guard against column reordering.
  mat_cov <- Matern(d = dist_mat,
                    smoothness = best_combos[ein, "nu"],
                    range = rho,
                    phi = best_combos[ein, "sigma.squared"]) + diag(best_combos[ein, "nugget"], nrow(dist_mat))
  for (i in seq_along(all_years)) {
    # Leave out i-th year; condition on the remaining observations.
    log_revenue <- (df.sub |> filter(YEAR != all_years[i] - 1))$LOG_REV
    SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
    SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
    SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]
    # Conditional Gaussian mean and standard deviation for the held-out year.
    predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
    standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
  }
  residuals <- abs((df.sub$LOG_REV - predictions) / standard_errors) # standardized residuals
  all_predictions[[ein]] <- predictions
  all_se[[ein]] <- standard_errors
  # |z| > 3 flags candidate outliers; map matrix indices back to calendar years.
  all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989
  p <- plot_true_vs_pred(df, ein, predictions, standard_errors, all_years, all_candidates[ein], best_combos[ein, ])
  print(p)
}
print(Sys.time() - start.time)